November 2018
Built with R version 3.5.1
devtools::install_github("nicolaroberts/hdp", build_vignettes = TRUE)
ggplot2reshape2gridExtratibblesurvminersurvivalglmnetRColorBrewerIRdisplaylibrary('ggplot2')
library('reshape2')
library('gridExtra')
library('tibble')
library('survminer')
library('survival')
library('glmnet')
library('RColorBrewer')
library('IRdisplay')
source('utils/tools.R') # custom tools function
source('utils/hdp_tools.R') # hdp related functions
theme_set(theme_minimal())
# set jupyer notebook parameters
options(repr.plot.res = 100, # set a medium-definition resolution for the jupyter notebooks plots (DPI)
repr.matrix.max.rows = 200, # set the maximum number of rows displayed
repr.matrix.max.cols = 200) # set the maximum number of columns displayed
Loading required package: ggpubr
Loading required package: magrittr
Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-16
Run citation('hdp') for citation instructions,
and file.show(system.file('LICENSE', package='hdp')) for license details.
# get cytogenetics data
dd_cyto <- read.table("data/dd_cyto_cut15.tsv", sep = '\t', stringsAsFactors = FALSE, header = TRUE)
# get mutation data
dd_mutation <- read.table("data/dd_mutation_hotspot_cut10.tsv", sep = '\t', stringsAsFactors = FALSE, header = TRUE)
# merge cytogenetics and mutation data
cat(all(rownames(dd_mutation) == rownames(dd_cyto)), '\n\n') # check if same row ordering
dd_all <- cbind(dd_mutation, dd_cyto)
print_size(dd_all)
head(dd_all, 3)
TRUE Size of dd_all: 3300 x 157
| ARID1A | ARID2 | ASXL1 | ASXL2 | ATRX | BCOR | BCORL1 | BRAF | BRCC3 | CALR | CBL | CDKN1B | CDKN2A | CEBPA | CREBBP | CSF1R | CSF3R | CSNK1A1 | CTCF | CUX1 | DDX23 | DDX4 | DDX41 | DDX54 | DHX33 | DICER1 | DNMT3A | DNMT3B | EED | EGFR | EP300 | ETNK1 | ETV6 | EZH2 | FAM175A | FLT3 | GATA1 | GATA2 | GNAS | GNB1 | HIPK2 | IDH1 | IDH2_140 | IDH2_172 | IRF1 | JAK2 | JARID2 | KDM5C | KDM6A | KIT | KMT2A | KMT2C | KMT2D | KRAS | LUC7L2 | MGA | MPL | MYC | NF1 | NFE2 | NIPBL | NOTCH1 | NOTCH2 | NPM1 | NRAS | PAPD5 | PHF6 | PHIP | PIK3CA | PPM1D | PRPF40B | PRPF8 | PTPN11 | PTPRF | RAD21 | RAD50 | RASGRF1 | RB1 | ROBO1 | ROBO2 | RRAS | RUNX1 | SETBP1 | SETD2 | SF3B1 | SH2B3 | SMC1A | SMC3 | SMG1 | SPRED2 | SRCAP | SRSF2 | STAG2 | STAT3 | SUZ12 | TERT | TET2 | TP53 | U2AF1_157 | U2AF1_34 | U2AF2 | WT1 | YLPM1 | ZBTB33 | ZMYM3 | ZNF318 | ZRSR2 | del5q | plus8 | del7 | delY | del20q | mar | del7q | del12p | del11q | del18 | del17p | del5 | plus21 | del17 | del13 | del3p | plus1q | del16 | del20 | plus11q | plus1p | del16q | del12 | del1p | plus19 | del13q | del21 | del9q | plus1 | plus22 | plus17q | del4q | plus11 | plus14 | del3 | del3q | plus17p | r_3_3 | del11p | del14 | del15 | delX | plus9 | r_9_9 | plus13 | plus15 | r_1_7 | del22 | ring | WGA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| I-H-132697-T1-1-D1-1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| I-H-132698-T1-1-D1-1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| I-H-116889-T2-1-D1-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
164 patients don't have any event:
nrow(dd_all[apply(dd_all, 1, function(x) sum(x)) == 0,])
We run HDP using multiple independent posterior sampling chains and a initial number of raw cluster of 15 (initcc = 15).
All the functions used are defined in utils/hdp_tools.R and make use of Nicola Roberts HDP R package (see notebook header).
To follow the HDP terminology, in the rest of the notebook the term component is used for cluster and the term category is used for one gene/cytogenetic feature (ie we have 157 categories).
# initialise hdp
hdp <- initialise_hdp(dd_all)
Initialise HDP on a 3300 x 157 dataframe → create HDP structure... done! → add DP node for each patient... done! → assign the data to the nodes... done!
# run multiple independent posterior sampling chains with different random seeds
number_of_chains <- 3
chain_list <- vector('list', number_of_chains)
for (i in 1:number_of_chains) {
seed <- i * 100
print_and_flush(sprintf('### Experiment %d (seed = %d) ###\n', i, seed))
# run single hdp chain
chain_list[[i]] <- activate_and_run_hdp(hdp,
initcc = 15,
burnin = 1000,
n = 300,
space = 20,
seed = seed)
print_and_flush('\n')
}
multi_output <- hdp_multi_chain(chain_list)
multi_output
### Experiment 1 (seed = 100) ### Activate HDP nodes and run posterior sampling → activate HDP nodes... done! → run posterior sampling... [1] "1000 burn-in iterations in 0.2 mins" ### Experiment 2 (seed = 200) ### Activate HDP nodes and run posterior sampling → activate HDP nodes... done! → run posterior sampling... [1] "1000 burn-in iterations in 0.2 mins" ### Experiment 3 (seed = 300) ### Activate HDP nodes and run posterior sampling → activate HDP nodes... done! → run posterior sampling... [1] "1000 burn-in iterations in 0.2 mins"
Object of class hdpSampleMulti
Number of chains: 3
Total posterior samples: 900
Components: NO. Run hdp_extract_components
----------
Final hdpState from first chain:
Object of class hdpState
Number of DP nodes: 3301
Index of parent DP: 0 1 1 1 1 1 1 1 1 1 ...
Number of data items per DP: 0 9 3 5 10 1 0 4 3 6 ...
Index of conparam per DP: 1 1 1 1 1 1 1 1 1 1 ...
Conparam hyperparameters and current value:
Shape Rate Value
Conparam 1 1 1 1.602232
Number of data categories: 157
Number of clusters: 14
Initialised with 15 clusters, using random seed 100
# assess quality of posterior sampling chain
for (experiment in chains(multi_output))
plot_posterior_sampling_chain_quality(experiment, 15, 5)
# extract and plot components
multi_output <- extract_components(multi_output)
plot_components_size(multi_output)
Extract HDP components from posterior sampling → extract components... done! * 11 components found
WARNING: component 0 is a "garbage" component, not a real component.
plot_category_distribution_by_component(multi_output, colnames(dd_all))
# get top 7 categories for each component
get_top_categories_by_component(multi_output, colnames(dd_all), top_number = 7)
| component_0 | component_1 | component_2 | component_3 | component_4 | component_5 | component_6 | component_7 | component_8 | component_9 | component_10 | component_11 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| ASXL1 | TET2 | TP53 | ASXL1 | SF3B1 | ASXL1 | del5q | ASXL1 | U2AF1_34 | DNMT3A | plus21 | plus1q |
| U2AF1_157 | SRSF2 | del5q | SRSF2 | TET2 | SETBP1 | SF3B1 | EZH2 | BCOR | NPM1 | plus8 | del7q |
| TET2 | ASXL1 | mar | STAG2 | DNMT3A | U2AF1_157 | DNMT3A | TET2 | DNMT3A | FLT3 | plus9 | r_1_7 |
| KMT2D | SF3B1 | del7 | RUNX1 | delY | del7 | TP53 | RUNX1 | del20q | WT1 | plus14 | ETNK1 |
| r_9_9 | CBL | del7q | TET2 | KMT2C | ETV6 | plus8 | ZRSR2 | BCORL1 | PTPN11 | TP53 | plus1 |
| PHF6 | CUX1 | del12p | IDH2_140 | KMT2D | RUNX1 | CSNK1A1 | U2AF1_157 | RUNX1 | NRAS | plus19 | ZMYM3 |
| STAT3 | DDX41 | del18 | BCOR | ZBTB33 | PTPN11 | TET2 | PHF6 | NRAS | KDM6A | plus1 | GATA1 |
Let's get the probability prediction for each patient and each component. Notice that the 164 patients with no mutation/cytogenetics are NA rows.
dd_predicted <- get_prediction_result_dataframe(multi_output, dd_all)
head(dd_predicted)
Number of components: 11 Number of NA rows : 164
| component_0 | component_1 | component_2 | component_3 | component_4 | component_5 | component_6 | component_7 | component_8 | component_9 | component_10 | component_11 | predicted_component | max_proba |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.01691358 | 0.04098765 | 0.009382716 | 0.07901235 | 0.16148148 | 0.37283951 | 0.0795061728 | 0.08716049 | 0.101728395 | 0.05086420 | 0 | 0.0001234568 | 5 | 0.3728395 |
| 0.11259259 | 0.03074074 | 0.000000000 | 0.16444444 | 0.06370370 | 0.11111111 | 0.0007407407 | 0.48851852 | 0.014814815 | 0.01333333 | 0 | 0.0000000000 | 7 | 0.4885185 |
| 0.01044444 | 0.02488889 | 0.002888889 | 0.02133333 | 0.40977778 | 0.36800000 | 0.0244444444 | 0.07177778 | 0.008444444 | 0.05777778 | 0 | 0.0002222222 | 4 | 0.4097778 |
| 0.01444444 | 0.11433333 | 0.071888889 | 0.04766667 | 0.04400000 | 0.03044444 | 0.0033333333 | 0.64288889 | 0.006555556 | 0.01188889 | 0 | 0.0125555556 | 7 | 0.6428889 |
| 0.02555556 | 0.19777778 | 0.000000000 | 0.33333333 | 0.02555556 | 0.18222222 | 0.0000000000 | 0.23555556 | 0.000000000 | 0.00000000 | 0 | 0.0000000000 | 3 | 0.3333333 |
| NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
n_components <- ncol(dd_predicted) - 2 - 1 # without columns 'component_0', 'predicted_component' and 'max_proba'
n_components
Component size in the dataset:
get_table(dd_predicted[,'predicted_component'])
| values | count | freq |
|---|---|---|
| 1 | 665 | 20.2% |
| 4 | 654 | 19.8% |
| 3 | 463 | 14% |
| 2 | 355 | 10.8% |
| 6 | 287 | 8.7% |
| 5 | 243 | 7.4% |
| 7 | 205 | 6.2% |
| NaN | 164 | 5% |
| 8 | 151 | 4.6% |
| 9 | 75 | 2.3% |
| 10 | 20 | 0.6% |
| 11 | 18 | 0.5% |
| -- total -- | 3300 | 100% |
plot_continous_feature_by_components <- function(data, feature_name, scale_y) {
set_notebook_plot_size(25, 7)
myLabeller <- function(x) {
x$predicted_component <- apply(x, 1, function(s) sprintf('Component %s (n = %d)', s, nrow(data[data$predicted_component == s,])))
return (x)
}
n_initial <- nrow(data)
data <- data[!is.na(data[feature_name]),]
p1 <- ggplot(data) + geom_boxplot(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7, notch = TRUE) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
ggtitle(sprintf('%s density by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) + theme(plot.title = element_text(size = 18, face = "bold"))
p2 <- ggplot(data) + geom_violin(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
ggtitle(' \n ') + theme(plot.title = element_text(size = 18, face = "bold"))
p3 <- ggplot(data) + geom_density(aes_string(feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
facet_wrap(~ predicted_component, ncol = 4, labeller = myLabeller) +
ggtitle(' \n ') + theme(plot.title = element_text(size = 18, face = "bold"))
if (scale_y) {
p1 <- p1 + scale_y_sqrt()
p2 <- p2 + scale_y_sqrt()
}
grid.arrange(p1, p2, p3, ncol = 3)
}
#suppressWarnings(plot_continous_feature_by_components(dd_clinical, 'HB', TRUE))
plot_assignement_probability_by_component(dd_predicted)
plot_assignement_probability_distribution_by_component(dd_predicted, 25, 10)
# merge patient information dataframe with predicted probability and component
dd_clustered <- cbind(dd_all, dd_predicted)
dd_clustered <- rownames_to_column(dd_clustered, var = 'patient_key')
# sort by decreseasing component size and increasing assignemnt probability
categories_table <- get_table(dd_clustered[,'predicted_component'], add_total_count = FALSE)
dd_clustered$predicted_component <- factor(dd_clustered$predicted_component, levels = categories_table$values)
dd_clustered <- dd_clustered[order(dd_clustered$predicted_component, dd_clustered$max_proba),]
print_size(dd_clustered)
head(dd_clustered, 3)
Size of dd_clustered: 3300 x 172
| patient_key | ARID1A | ARID2 | ASXL1 | ASXL2 | ATRX | BCOR | BCORL1 | BRAF | BRCC3 | CALR | CBL | CDKN1B | CDKN2A | CEBPA | CREBBP | CSF1R | CSF3R | CSNK1A1 | CTCF | CUX1 | DDX23 | DDX4 | DDX41 | DDX54 | DHX33 | DICER1 | DNMT3A | DNMT3B | EED | EGFR | EP300 | ETNK1 | ETV6 | EZH2 | FAM175A | FLT3 | GATA1 | GATA2 | GNAS | GNB1 | HIPK2 | IDH1 | IDH2_140 | IDH2_172 | IRF1 | JAK2 | JARID2 | KDM5C | KDM6A | KIT | KMT2A | KMT2C | KMT2D | KRAS | LUC7L2 | MGA | MPL | MYC | NF1 | NFE2 | NIPBL | NOTCH1 | NOTCH2 | NPM1 | NRAS | PAPD5 | PHF6 | PHIP | PIK3CA | PPM1D | PRPF40B | PRPF8 | PTPN11 | PTPRF | RAD21 | RAD50 | RASGRF1 | RB1 | ROBO1 | ROBO2 | RRAS | RUNX1 | SETBP1 | SETD2 | SF3B1 | SH2B3 | SMC1A | SMC3 | SMG1 | SPRED2 | SRCAP | SRSF2 | STAG2 | STAT3 | SUZ12 | TERT | TET2 | TP53 | U2AF1_157 | U2AF1_34 | U2AF2 | WT1 | YLPM1 | ZBTB33 | ZMYM3 | ZNF318 | ZRSR2 | del5q | plus8 | del7 | delY | del20q | mar | del7q | del12p | del11q | del18 | del17p | del5 | plus21 | del17 | del13 | del3p | plus1q | del16 | del20 | plus11q | plus1p | del16q | del12 | del1p | plus19 | del13q | del21 | del9q | plus1 | plus22 | plus17q | del4q | plus11 | plus14 | del3 | del3q | plus17p | r_3_3 | del11p | del14 | del15 | delX | plus9 | r_9_9 | plus13 | plus15 | r_1_7 | del22 | ring | WGA | component_0 | component_1 | component_2 | component_3 | component_4 | component_5 | component_6 | component_7 | component_8 | component_9 | component_10 | component_11 | predicted_component | max_proba | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1007 | E-H-110762-T1-1-D1-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.01916667 | 0.1994444 | 0.07333333 | 0.11666667 | 0.14055556 | 0.16805556 | 0.060555556 | 0.05444444 | 0.0075000000 | 0.0319444444 | 0.09805556 | 0.03027778 | 1 | 0.1994444 |
| 941 | E-H-102643-T1-1-D1-1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.01185185 | 0.2118519 | 0.19814815 | 0.17888889 | 0.07888889 | 0.03777778 | 0.135185185 | 0.13148148 | 0.0007407407 | 0.0000000000 | 0.01518519 | 0.00000000 | 1 | 0.2118519 |
| 936 | E-H-102628-T1-1-D1-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.03634921 | 0.2134921 | 0.04238095 | 0.06222222 | 0.20238095 | 0.10476190 | 0.007460317 | 0.15269841 | 0.0287301587 | 0.0004761905 | 0.05936508 | 0.08968254 | 1 | 0.2134921 |
# compute column separators position
col_sep <- c(cumsum(categories_table$count))
col_sep <- col_sep
# compute row separators position
row_sep <- c(58, 6, 15, 8, 11, 7, 2, 50)
row_sep_cum <- c(cumsum(row_sep))
row_sep_cum <- row_sep_cum[-length(row_sep_cum)]
# compute row colors vector
sep_color <- rev(c(rgb(180/255, 142/255, 84/255),
rgb(146/255, 187/255, 105/255),
rgb(233/255, 72/255 , 157/255),
rgb(72/255 , 169/255, 137/255),
rgb(226/255, 146/255, 102/255),
rgb(140/255, 137/255, 190/255),
rgb(238/255, 198/255, 116/255),
rgb(0/255, 0/255, 0/255)))
row_colors <- c()
for (i in 1:length(row_sep))
row_colors <- c(row_colors, rep(sep_color[i], row_sep[i]))
# manual gene grouping
col_order <- c(
'ARID1A', 'ARID2', 'BCORL1', 'BRCC3', 'CALR', 'CDKN1B', 'CDKN2A', 'CEBPA', 'CREBBP', 'CSF1R', 'CSF3R', 'CSNK1A1', 'DDX23', 'DDX4', 'DDX41',
'DDX54', 'DHX33', 'DICER1', 'DNMT3B', 'EED', 'EGFR', 'ETNK1', 'FAM175A', 'HIPK2', 'JAK2', 'JARID2', 'KDM5C', 'KMT2C', 'KMT2D', 'LUC7L2',
'MGA', 'MPL', 'NFE2', 'NIPBL', 'NOTCH2', 'NPM1', 'PAPD5', 'PHIP', 'PIK3CA', 'PRPF40B', 'PTPRF', 'RAD50', 'RASGRF1', 'RB1', 'ROBO1', 'ROBO2',
'RRAS', 'SETD2', 'SMG1', 'SPRED2', 'SRCAP', 'STAT3', 'SUZ12', 'TERT', 'YLPM1', 'ZBTB33', 'ZMYM3', 'ZNF318',
'ASXL1', 'BCOR', 'EZH2', 'RAD21', 'STAG2', 'ASXL2',
'ATRX', 'CUX1', 'GNAS', 'IRF1', 'KDM6A', 'KMT2A', 'NOTCH1', 'PHF6', 'SETBP1', 'WT1', 'SH2B3', 'SMC1A', 'EP300', 'SMC3', 'GNB1',
'BRAF', 'CBL', 'FLT3', 'KIT', 'KRAS', 'NF1', 'NRAS', 'PTPN11',
'DNMT3A', 'IDH1', 'IDH2_140', 'IDH2_172', 'TET2', 'ETV6', 'GATA1', 'GATA2', 'RUNX1', 'CTCF', 'MYC',
'SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'U2AF2', 'ZRSR2', 'PRPF8',
'PPM1D', 'TP53',
'del1p', 'del3', 'del3q', 'del4q', 'del5', 'del5q', 'del7', 'del7q', 'del9q', 'del11p', 'del11q', 'del12', 'del12p', 'del13', 'del13q',
'del3p', 'del14', 'del15', 'del16', 'del16q', 'del17', 'del17p', 'del18', 'del20', 'del20q', 'del21', 'del22', 'delX', 'delY',
'plus1', 'plus1p', 'plus1q', 'plus8', 'plus9', 'plus11', 'plus11q', 'plus13', 'plus14', 'plus15', 'plus17p', 'plus17q', 'plus19', 'plus21', 'plus22',
'ring', 'mar', 'WGA', 'r_3_3', 'r_9_9', 'r_1_7'
)
length(col_order) # check that we have all 157 categories
# prepare and sort long datframe
long_dd_clustered <- melt(dd_clustered, id = c('patient_key', sprintf('component_%d', 0:11), 'predicted_component', 'max_proba'))
long_dd_clustered$variable <- factor(long_dd_clustered$variable, level = col_order)
long_dd_clustered$predicted_component <- factor(long_dd_clustered$predicted_component, level = categories_table$values)
long_dd_clustered <- long_dd_clustered[order(long_dd_clustered$predicted_component),]
long_dd_clustered$patient_key <- factor(long_dd_clustered$patient_key, levels = unique(long_dd_clustered$patient_key))
# set a high-definition resolution for this plot (DPI)
options(repr.plot.res = 200)
set_notebook_plot_size(10, 10)
# plot the heatmap
ggplot(long_dd_clustered, aes(patient_key, variable)) +
# geom raster heatmap
geom_raster(aes(fill = factor(value)), show.legend = FALSE) +
scale_fill_manual(values = c('0' = 'lightgrey', '1' = 'steelblue')) +
# column separation and row gene grouping (+ 0.5 to be exactly at the good location)
geom_vline(xintercept = col_sep[-length(col_sep)] + 0.5, col = 'white', linetype = 3, size = 0.3) + # [-length(col_sep)] to remove last separator
geom_hline(yintercept = row_sep_cum + 0.5, col = 'white', linetype = 3, size = 0.3) +
# 3d effect by adding white and grey horizontal line
geom_hline(yintercept = seq(1, 157, 2) + 0.5, col = 'white', linetype = 1, size = 0.03) +
geom_hline(yintercept = seq(0, 157, 2) + 0.5, col = 'darkgrey', linetype = 1, size = 0.04) +
# legend parameters
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_blank(), axis.text.y = element_blank(), plot.margin = unit(c(1, 1, 0.2, 1), 'cm')) + # top right bottom left
labs(x = 'patients') +
coord_cartesian(xlim = c(0, 3300), ylim = c(1.5, 156.5), clip = 'off') +
# colored gene label
annotate('text', x = rep(-15, 157) , y = 1:157, label = col_order, color = row_colors, size = 1.5, hjust = 1) +
annotate('text', x = rep(3315, 157), y = 1:157, label = col_order, color = row_colors, size = 1.5, hjust = 0) +
# component label
annotate('text', x = col_sep - (categories_table$count / 2), y = c(rep(159, 11), 161), label = categories_table$values, size = 3, hjust = 0.5, vjust = 0.5)
categories_table
| values | count | freq |
|---|---|---|
| 1 | 665 | 20.2% |
| 4 | 654 | 19.8% |
| 3 | 463 | 14% |
| 2 | 355 | 10.8% |
| 6 | 287 | 8.7% |
| 5 | 243 | 7.4% |
| 7 | 205 | 6.2% |
| NaN | 164 | 5% |
| 8 | 151 | 4.6% |
| 9 | 75 | 2.3% |
| 10 | 20 | 0.6% |
| 11 | 18 | 0.5% |
# create a dataframe showing the count by component for each category
categories_repartition <- data.frame(category = colnames(dd_all))
for (i in 1:n_components)
categories_repartition[sprintf('component_%d', i)] <- apply(categories_repartition, 1, function(s) sum(dd_clustered[dd_clustered$predicted_component == i, s['category']]))
# sort categories_repartition by most frequent genes
categories_repartition['total_count'] <- rowSums(categories_repartition[,-1])
categories_repartition <- categories_repartition[order(categories_repartition$total_count, decreasing = TRUE),]
head(categories_repartition)
| category | component_1 | component_2 | component_3 | component_4 | component_5 | component_6 | component_7 | component_8 | component_9 | component_10 | component_11 | total_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 97 | TET2 | 412 | 32 | 149 | 234 | 38 | 43 | 83 | 21 | 11 | 5 | 0 | 1028 |
| 3 | ASXL1 | 169 | 18 | 298 | 72 | 125 | 31 | 164 | 22 | 1 | 2 | 3 | 905 |
| 85 | SF3B1 | 101 | 16 | 42 | 410 | 23 | 114 | 10 | 15 | 5 | 0 | 0 | 736 |
| 92 | SRSF2 | 202 | 8 | 274 | 21 | 48 | 2 | 4 | 1 | 4 | 3 | 2 | 569 |
| 27 | DNMT3A | 35 | 44 | 37 | 213 | 29 | 73 | 8 | 49 | 36 | 3 | 3 | 530 |
| 108 | del5q | 21 | 217 | 12 | 11 | 13 | 166 | 2 | 3 | 7 | 9 | 0 | 461 |
# plot category distribution colored by component count
# transform (extand) categories_repartition in a long data frame by gathering the components columns in one column
long_categories_repartition <- melt(categories_repartition, id = c('category', 'total_count'))
# sort long_gene_repartition by most frequent genes
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = categories_repartition$category)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]
set_notebook_plot_size(25, 7)
# count plot
ggplot(long_categories_repartition) +
geom_bar(aes(category, value, fill = variable), stat = 'identity') +
tilt_x_label(70) +
scale_fill_brewer(palette = 'Spectral', na.value = 'grey')
# proportion plot
ggplot(long_categories_repartition) +
geom_bar(aes(category, value, fill = variable), stat = 'identity', position = 'fill') +
tilt_x_label(70) +
scale_fill_brewer(palette = 'Spectral', na.value = 'grey')
# plot category distribution by component
# sort long_gene_repartition by heatmap gene order
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = col_order)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]
# plot category distribution by component
set_notebook_plot_size(25, 40)
ggplot(long_categories_repartition) + geom_bar(aes(category, value, fill = variable), stat='identity', alpha = 0.8) + tilt_x_label(70) + facet_wrap(~ variable, ncol = 1, scales = 'free')
# save dd_clustered
write.table(dd_clustered, file = 'data/dd_clustered.tsv', sep = '\t')
dd_clustered <- read.table("data/dd_clustered.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
dd_clustered$predicted_component <- factor(dd_clustered$predicted_component)
From Elsa:


# get clinical data
dd_clinical <- read.table('data/df_clinical_selected.tsv', sep = '\t', stringsAsFactors = FALSE, header = TRUE)
# merge with clustering data (that already contains mutation and cytogenetics data)
dd_clinical <- merge(dd_clustered, dd_clinical, by.x = 'patient_key', by.y = 'LEUKID')
print_size(dd_clinical)
head(dd_clinical, 2)
Size of dd_clinical: 3300 x 253
| patient_key | ARID1A | ARID2 | ASXL1 | ASXL2 | ATRX | BCOR | BCORL1 | BRAF | BRCC3 | CALR | CBL | CDKN1B | CDKN2A | CEBPA | CREBBP | CSF1R | CSF3R | CSNK1A1 | CTCF | CUX1 | DDX23 | DDX4 | DDX41 | DDX54 | DHX33 | DICER1 | DNMT3A | DNMT3B | EED | EGFR | EP300 | ETNK1 | ETV6 | EZH2 | FAM175A | FLT3 | GATA1 | GATA2 | GNAS | GNB1 | HIPK2 | IDH1 | IDH2_140 | IDH2_172 | IRF1 | JAK2 | JARID2 | KDM5C | KDM6A | KIT | KMT2A | KMT2C | KMT2D | KRAS | LUC7L2 | MGA | MPL | MYC | NF1 | NFE2 | NIPBL | NOTCH1 | NOTCH2 | NPM1 | NRAS | PAPD5 | PHF6 | PHIP | PIK3CA | PPM1D | PRPF40B | PRPF8 | PTPN11 | PTPRF | RAD21 | RAD50 | RASGRF1 | RB1 | ROBO1 | ROBO2 | RRAS | RUNX1 | SETBP1 | SETD2 | SF3B1 | SH2B3 | SMC1A | SMC3 | SMG1 | SPRED2 | SRCAP | SRSF2 | STAG2 | STAT3 | SUZ12 | TERT | TET2 | TP53 | U2AF1_157 | ⋯ | plus15 | r_1_7 | del22 | ring | WGA | component_0 | component_1 | component_2 | component_3 | component_4 | component_5 | component_6 | component_7 | component_8 | component_9 | component_10 | component_11 | predicted_component | max_proba | UNIQUE_PATIENT_ID | UNIQUE_SAMPLE_ID | SEX | MDS_TYPE | DIAG_IPSS | DIAG_IPSS_R | AOD | SAMPLE_TYPE | DIAG_RINGED_SIDEROBLASTS | SAMPLE_IPSS | SAMPLE_IPSS_R | AGE_AT_SAMPLE_TIME | SAMPLE_RINGED_SIDEROBLASTS | DMT | TYPE_OF_DISEASE_MODIFYING_TREATMENT | TRANSPLANT | TRANSPLANT_STAGE | TRANSPLANT_TYPE | PRIOR_CHEMOTHERAPY | PATIENT_RECEIVED_MORE_THAN_1_DMT | PATIENT_RECEIVED_MORE_THAN_1_TPL | DIAG_RINGED_SIDEROBLASTS_BINARY15 | CENTER | CENTER_COHORT | BATCH | FLAG_KEEP_REASON | DATE_OF_DIAGNOSIS_CORRECT | DATE_OF_SAMPLE_CORRECT | FLAG_ISSUE_DATE | AML | SEX_INFER_SEQUENCING | FLAG_ISSUE_SEX | DIAG_PB_BLAST_RANGE | SAMPLE_PB_BLAST_RANGE | DIAG_RINGED_SIDEROBLASTS_RANGE | DIAG_BM_BLAST_RANGE | SAMPLE_RINGED_SIDEROBLASTS_RANGE | SAMPLE_BM_BLAST_RANGE | DIAG_RINGED_SIDEROBLASTS_BINARY | SAMPLE_RINGED_SIDEROBLASTS_BINARY | SAMPLE_RINGED_SIDEROBLASTS_BINARY15 | QC_DETAILED | SELECTED_SAMPLE | time_diag_sample_days | time_diag_sample_range | type_of_dmt | DMT_STRICT | is_sample_naive | dmt_stage | WHO_2008 | FAB | PB_BLAST | LDH | FERRITIN | WBC | ANC | HB | PLT | MONOCYTES | BM_BLAST | CYTOGENETICS | NUMBER_OF_METAPHASES | IPSS | IPSS_R | RINGED_SIDEROBLASTS | RINGED_SIDEROBLASTS_BINARY | RINGED_SIDEROBLASTS_BINARY15 | WHO_2008_SIMPLIFY | time_diag_dmt_strict_days | time_diag_transplant_days | is_treated | dmt_active | lenalidomid | hma | transplant | os_diag_years | os_status | aml | time_diag_aml_days | aml_status | aml_diag_years |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| E-H-100000-T1-1-D1-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0.002222222 | 0.1820833 | 0.003333333 | 0.01875000 | 0.22791667 | 0.01763889 | 0.5155556 | 0.01625000 | 0.012638889 | 0.003055556 | 0.0005555556 | 0.0000000000 | 6 | 0.5155556 | ihbt_1 | 276 | F | primary | low | good | 57 | BM granulocytes | NA | NA | NA | 57 | NA | yes | LEN | no | NA | NA | NA | NA | NA | NA | IHBT | IHBT_1 | B | pass | 9-mar-04 | 9-mar-04 | NA | no | F | NA | <5% | NA | NA | <5% | NA | NA | NA | NA | NA | pass | yes | 0 | < 1 month | len | yes | naive | after-sample | del(5q) | RA | 0 | 7.3 | 226 | 7.50 | 4.65 | 8.8 | 406 | NA | 3 | 46,xx,del(5)(q15q33.3)[3]/46,xx[1] | 22 | low | good | NA | NA | NA | 5q- | 2432 | NA | treated | yes | 1 | 0 | 0 | 5.819178 | 0 | no | NA | 0 | 5.819178 |
| E-H-100001-T1-1-D1-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0.012592593 | 0.1511111 | 0.178518519 | 0.04259259 | 0.04777778 | 0.01111111 | 0.4777778 | 0.03074074 | 0.007037037 | 0.000000000 | 0.0403703704 | 0.0003703704 | 6 | 0.4777778 | ihbt_2 | 541 | F | primary | int1 | good | 56 | BM granulocytes | NA | NA | NA | 56 | NA | no | no | NA | NA | NA | NA | NA | NA | IHBT | IHBT_1 | B | pass | 23-feb-07 | 23-feb-07 | NA | no | F | NA | <5% | NA | NA | <5% | NA | NA | NA | NA | NA | pass | yes | 0 | < 1 month | no | naive | no-treatment | RA | RA | 0 | 5.0 | 103 | 7.21 | 2.88 | 9.8 | 364 | NA | 2 | 46,xx,del(5)(q13q33)[6]/47,xx,+8[1]/46,xx[15] | 22 | int1 | good | NA | NA | NA | RCUD | NA | NA | not-treated | no | 0 | 0 | 0 | 2.857534 | 0 | no | NA | 0 | 2.857534 |
get_colors <- function(n, add_color = c()) {
# Return a vector of n divergent and nice-looking colors
# → Ex : get_colors(3, add_color = c('grey', 'blue')) ⟹ c('#9E0142', '#FEE08B', '#88CFA4', 'grey', 'blue')
# → Arguments
# - n : number of colors
# - add_color: if specified, add the given color vector at the end of the generated color vector
colors <- c()
if (n >= 5)
colors <- colorRampPalette(brewer.pal(11, "Spectral"))(n)
else
colors <- c('#9E0142', '#FEE08B', '#88CFA4', '#5E4FA2')[1:n]
return (c(colors, add_color))
}
get_colors(3, add_color = c('grey', 'blue'))
plot_continuous_feature_by_categorical_feature <- function(data, continuous_feature_name, categorical_feature_name, scale_y_sqrt, legend_prefix = '') {
# Plot the distribution of a continuous feature across a categorical feature, produce boxplot, violin plot and density plots side by side
# → Arguments
# - data : dataframe with both the continous feature and the categorical feature as columns, the categorical feature MUST BE a factor
# - continuous_feature_name : name of the continuous feature to plot
# - categorical_feature_name: name of the categorical feature to plot
# - scale_y_sqrt : wheter to plot with a sqrt y scale or not
# - legend_prefix : add a prefix to a legend
# helper function to transform a category name (ex: cat_1) to its name and its count (ex: cat_1 (n = 42))
get_legend <- function(legend_prefix, x, categorical_count_table) {
return (sprintf('%s%s (n = %d)', legend_prefix, x, categorical_count_table[categorical_count_table$values == x,]$count))
}
# create new categorical feature by adding (n = count) for each category, ordered as the original factor categorical feature
categorical_count_table <- get_table(data[,categorical_feature_name], add_total_count = FALSE)
data$categorical <- sapply(data[,categorical_feature_name], function(x) get_legend(legend_prefix, x, categorical_count_table))
# order
data$categorical <- factor(data$categorical, levels = sapply(levels(data[,categorical_feature_name]), function(x) get_legend(legend_prefix, x, categorical_count_table)))
# remove NA values
n_initial <- nrow(data)
data <- data[! is.na(data[continuous_feature_name]),]
set_notebook_plot_size(25, 7)
colors <- get_colors(nrow(categorical_count_table))
# boxplot
p1 <- ggplot(data) +
geom_boxplot(aes_string('categorical', continuous_feature_name, fill = 'categorical'), alpha = 0.7, notch = TRUE) +
scale_fill_manual(values = colors) +
ggtitle(sprintf('%s density by %s\n(removed %d NA values)', continuous_feature_name, categorical_feature_name, n_initial - nrow(data))) +
theme(plot.title = element_text(size = 18, face = "bold")) +
scale_x_discrete(labels = levels(data[,categorical_feature_name]), breaks = waiver())
# violin plot
p2 <- ggplot(data) + geom_violin(aes_string('categorical', continuous_feature_name, fill = 'categorical'), alpha = 0.7) +
scale_fill_manual(values = colors) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold")) +
scale_x_discrete(labels = levels(data[,categorical_feature_name]), breaks = waiver())
# density plot
p3 <- ggplot(data) + geom_density(aes_string(continuous_feature_name, fill = 'categorical'), alpha = 0.7) +
facet_wrap(~ categorical, ncol = 4) +
scale_fill_manual(values = colors) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold"))
if (scale_y_sqrt) {
p1 <- p1 + scale_y_sqrt()
p2 <- p2 + scale_y_sqrt()
}
# plot the three plots side by side and remove ggplot warnings
suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}
# plot_continuous_feature_by_categorical_feature(dd_clinical, 'HB', 'predicted_component', TRUE, 'comp_')
# plot the distribution of multiple continuous features accross components
for (feature_name in c('HB', 'PLT', 'ANC', 'WBC', 'MONOCYTES', 'PB_BLAST', 'BM_BLAST', 'RINGED_SIDEROBLASTS'))
plot_continuous_feature_by_categorical_feature(dd_clinical, feature_name, 'predicted_component', TRUE)
plot_continuous_feature_by_categorical_feature(dd_clinical, 'AGE_AT_SAMPLE_TIME', 'predicted_component', FALSE)
plot_categorical_feature_by_categorical_feature <- function(data, feature_name_1, feature_name_2) {
# Plot the distribution of a categorical feature across another categorical feature
# → Arguments
# - data : dataframe with both the two categorical features
# - feature_name_1: first categorical feature
# - feature_name_2: second categorical feature
# remove NA values
n_initial <- nrow(data)
data <- data[!is.na(data[feature_name_1]),]
set_notebook_plot_size(25, 5)
colors <- get_colors(length(unique(data[,feature_name_2])))
# categorical feature 1 distribution
p1 <- ggplot(data) +
geom_bar(aes_string(feature_name_2, fill = feature_name_1), position = 'fill') +
ggtitle(sprintf('%s count and proportion by %s\n(removed %d NA values)', feature_name_1, feature_name_2, n_initial - nrow(data))) +
theme(plot.title = element_text(size = 18, face = "bold"))
# categorical feature 2 distribution
p2 <- ggplot(data) +
geom_bar(aes_string(feature_name_1, fill = feature_name_2), position = 'fill') +
scale_fill_manual(values = colors) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold"))
# categorical feature count by component
p3 <- ggplot(data) +
geom_bar(aes_string(feature_name_1, fill = feature_name_2), position = 'dodge') +
scale_fill_manual(values = colors) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold"))
# plot the three plots side by side and remove ggplot warnings
suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}
# plot_categorical_feature_by_categorical_feature(dd_clinical, 'AML', 'predicted_component')
# plot the distribution of multiple categorical features accross components
for (feature_name in c('WHO_2008_SIMPLIFY', 'SEX'))
plot_categorical_feature_by_categorical_feature(dd_clinical, feature_name, 'predicted_component')
# get countries data
dd_countries <- read.table('data/CentersBatches.csv', sep = ',', stringsAsFactors = FALSE, header = TRUE)
# merge with clinical data (that already contains mutation and cytogenetics data as well as clustering data)
dd_clinical <- merge(dd_clinical, unique(dd_countries[, c('center', 'country')]), by.x = 'CENTER', by.y = 'center')
print_size(dd_clinical)
head(dd_clinical, 2)
Size of dd_clinical: 3300 x 254
| CENTER | patient_key | ARID1A | ARID2 | ASXL1 | ASXL2 | ATRX | BCOR | BCORL1 | BRAF | BRCC3 | CALR | CBL | CDKN1B | CDKN2A | CEBPA | CREBBP | CSF1R | CSF3R | CSNK1A1 | CTCF | CUX1 | DDX23 | DDX4 | DDX41 | DDX54 | DHX33 | DICER1 | DNMT3A | DNMT3B | EED | EGFR | EP300 | ETNK1 | ETV6 | EZH2 | FAM175A | FLT3 | GATA1 | GATA2 | GNAS | GNB1 | HIPK2 | IDH1 | IDH2_140 | IDH2_172 | IRF1 | JAK2 | JARID2 | KDM5C | KDM6A | KIT | KMT2A | KMT2C | KMT2D | KRAS | LUC7L2 | MGA | MPL | MYC | NF1 | NFE2 | NIPBL | NOTCH1 | NOTCH2 | NPM1 | NRAS | PAPD5 | PHF6 | PHIP | PIK3CA | PPM1D | PRPF40B | PRPF8 | PTPN11 | PTPRF | RAD21 | RAD50 | RASGRF1 | RB1 | ROBO1 | ROBO2 | RRAS | RUNX1 | SETBP1 | SETD2 | SF3B1 | SH2B3 | SMC1A | SMC3 | SMG1 | SPRED2 | SRCAP | SRSF2 | STAG2 | STAT3 | SUZ12 | TERT | TET2 | TP53 | ⋯ | plus15 | r_1_7 | del22 | ring | WGA | component_0 | component_1 | component_2 | component_3 | component_4 | component_5 | component_6 | component_7 | component_8 | component_9 | component_10 | component_11 | predicted_component | max_proba | UNIQUE_PATIENT_ID | UNIQUE_SAMPLE_ID | SEX | MDS_TYPE | DIAG_IPSS | DIAG_IPSS_R | AOD | SAMPLE_TYPE | DIAG_RINGED_SIDEROBLASTS | SAMPLE_IPSS | SAMPLE_IPSS_R | AGE_AT_SAMPLE_TIME | SAMPLE_RINGED_SIDEROBLASTS | DMT | TYPE_OF_DISEASE_MODIFYING_TREATMENT | TRANSPLANT | TRANSPLANT_STAGE | TRANSPLANT_TYPE | PRIOR_CHEMOTHERAPY | PATIENT_RECEIVED_MORE_THAN_1_DMT | PATIENT_RECEIVED_MORE_THAN_1_TPL | DIAG_RINGED_SIDEROBLASTS_BINARY15 | CENTER_COHORT | BATCH | FLAG_KEEP_REASON | DATE_OF_DIAGNOSIS_CORRECT | DATE_OF_SAMPLE_CORRECT | FLAG_ISSUE_DATE | AML | SEX_INFER_SEQUENCING | FLAG_ISSUE_SEX | DIAG_PB_BLAST_RANGE | SAMPLE_PB_BLAST_RANGE | DIAG_RINGED_SIDEROBLASTS_RANGE | DIAG_BM_BLAST_RANGE | SAMPLE_RINGED_SIDEROBLASTS_RANGE | SAMPLE_BM_BLAST_RANGE | DIAG_RINGED_SIDEROBLASTS_BINARY | SAMPLE_RINGED_SIDEROBLASTS_BINARY | SAMPLE_RINGED_SIDEROBLASTS_BINARY15 | QC_DETAILED | SELECTED_SAMPLE | time_diag_sample_days | time_diag_sample_range | type_of_dmt | DMT_STRICT | is_sample_naive | dmt_stage | WHO_2008 | FAB | PB_BLAST | LDH | FERRITIN | WBC | ANC | HB | PLT | MONOCYTES | BM_BLAST | CYTOGENETICS | NUMBER_OF_METAPHASES | IPSS | IPSS_R | RINGED_SIDEROBLASTS | RINGED_SIDEROBLASTS_BINARY | RINGED_SIDEROBLASTS_BINARY15 | WHO_2008_SIMPLIFY | time_diag_dmt_strict_days | time_diag_transplant_days | is_treated | dmt_active | lenalidomid | hma | transplant | os_diag_years | os_status | aml | time_diag_aml_days | aml_status | aml_diag_years | country |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CCH | E-H-106024-T1-1-D1-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0.07111111 | 0.13611111 | 0.000000000 | 0.00000000 | 0.38166667 | 0.2088889 | 0.003333333 | 0.1977778 | 0.0005555556 | 0.0000 | 0.000000000 | 0.0005555556 | 4 | 0.3816667 | 129 | 129 | M | primary | low | very_good | 70 | BM_MNC | NA | NA | NA | 70 | NA | no | NA | no | NA | NA | NA | NA | NA | NA | CCH_1 | D | pass | 7-apr-10 | 1-jun-10 | NA | no | M | NA | <5% | <5% | NA | <5% | NA | <5% | NA | NA | NA | pass | yes | 55 | 1 to 6 months | NA | no | naive | no-treatment | RCMD | RA | 0 | NA | NA | 8.5 | 6.5 | 12.4 | 141 | NA | 2 | 46,xy[29]/45,x,-y[3] | 32 | low | very_good | NA | NA | NA | RCMD | NA | NA | not-treated | no | 0 | 0 | 0 | 3.350685 | 0 | no | NA | 0 | 3.350685 | france |
| CCH | E-H-106025-T1-1-D1-1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0.02277778 | 0.03972222 | 0.002638889 | 0.09958333 | 0.03013889 | 0.1445833 | 0.009583333 | 0.6425000 | 0.0044444444 | 0.0025 | 0.001527778 | 0.0000000000 | 7 | 0.6425000 | 130 | 130 | M | primary | int1 | int | 75 | BM_MNC | NA | NA | NA | 75 | NA | no | NA | no | NA | NA | NA | NA | NA | NA | CCH_1 | D | pass | 26-jun-10 | 12-jul-10 | NA | no | M | NA | <5% | <5% | NA | <5% | NA | <5% | NA | NA | NA | pass | yes | 16 | < 1 month | NA | no | naive | no-treatment | RAEB-1 | RAEB | 0 | NA | NA | 6.8 | 4.9 | 11.0 | 48 | NA | 5 | 47,xy,+8[10]/46,xy[10] | 20 | int1 | int | NA | NA | NA | RAEB | NA | NA | not-treated | no | 0 | 0 | 0 | 2.794521 | 1 | no | NA | 0 | 2.794521 | france |
get_table(dd_clinical$country)
| values | count | freq |
|---|---|---|
| sweden | 889 | 26.9% |
| germany | 637 | 19.3% |
| italy | 572 | 17.3% |
| spain | 317 | 9.6% |
| netherland | 198 | 6% |
| france | 159 | 4.8% |
| brazil | 119 | 3.6% |
| taiwan | 107 | 3.2% |
| austria | 83 | 2.5% |
| united_states | 70 | 2.1% |
| greece | 66 | 2% |
| england | 50 | 1.5% |
| czech_republica | 33 | 1% |
| -- total -- | 3300 | 100% |
# gather european countries
dd_clinical$region <- dd_clinical$country
dd_clinical$region[dd_clinical$country %in% c('austria', 'czech_republica', 'england', 'france', 'germany', 'greece', 'italy', 'netherland', 'spain', 'sweden')] <- 'europe'
get_table(dd_clinical$region)
| values | count | freq |
|---|---|---|
| europe | 3004 | 91% |
| brazil | 119 | 3.6% |
| taiwan | 107 | 3.2% |
| united_states | 70 | 2.1% |
| -- total -- | 3300 | 100% |
plot_categorical_feature_by_categorical_feature(dd_clinical, 'region', 'predicted_component')
plot_survival_curve <- function(data, stratifier_column_name, colors, line_size = 1.5, legend_size = 15, legend_interspace = 1.3, surv_diag_years = 'os_diag_years', surv_status = 'os_status', cum_event = FALSE) {
# Plot the survival curve of the given data on the given stratifier
# → Arguments
# - data
# - stratifier_column_name: name of the column on which to stratify the survival plot
# - colors : vector of colors of the same size as the number of stratification
# - line_size
# - legend_size
# - legend_interspace : y space between legend lines
# - surv_diag_years : formula variable 1
# - surv_status : formula variable 2
# - cum_event : set to true for cumulative event curve rather than survival curve
if (cum_event) {
fun <- 'event'
ylab <- 'cumulative incidence'
}
else {
fun <- NULL
ylab <- 'survival probability'
}
# evaluate the survival curve
formula <- sprintf('Surv(%s, %s) ~ %s', surv_diag_years, surv_status, stratifier_column_name)
surv_fit <- surv_fit(as.formula(formula), data = data)
# set legend labels
short_names <- sapply(names(surv_fit$strata), function (s) strsplit(s, '=')[[1]][[2]])
legend_labels <- sprintf('%s (n = %s)', short_names, surv_fit$strata)
# create plot object
ggsurv <- ggsurvplot(surv_fit,
fun = fun,
censor = FALSE,
palette = colors,
linetype = 'solid',
size = line_size,
xlab = 'years',
ylab = ylab,
legend = c(0.6, 0.7),
legend.title = '',
legend.labs = legend_labels)
# modify plot object legend properties
ggsurv$plot <- ggsurv$plot + theme(legend.text = element_text(size = legend_size), legend.key.size = unit(legend_interspace, 'lines'))
# return ggplot object
return (ggsurv$plot)
}
set_notebook_plot_size(8, 4)
plot_survival_curve(dd_clinical, 'predicted_component', get_colors(11, 'grey'), line_size = 0.8, legend_size = 10, legend_interspace = 0.8)
plot_survival_curve_stratified_by_categorical_feature <- function(data, mask, wildtype_mask, stratifier_column_name, legend_prefix, colors, remove_na = FALSE, wildtype_name = 'wildtype', AML = FALSE) {
# Plot the survival curve of the given data on the given stratifier with the given mask and wildtypes
# → Arguments
# - data
# - mask : mask of the data on which to stratify
# - wildtype_mask : mask for the wildtype
# - stratifier_column_name: name of the column on which to stratify the survival plot
# - legend_prefix : prefix for the masked and stratified legend
# - colors : vector of colors of the same size as the number of stratification
# - remove_na : if TRUE remove NA on stratifier
# - wildtype_name : name of the wildtype curve
# - AML : whether to plot a cumulative incidence plot for AML progression or not
# create stratification
data$stratify <- wildtype_name
if (stratifier_column_name == '')
data[mask, 'stratify'] <- legend_prefix
else
data[mask, 'stratify'] <- sprintf('%s%s', legend_prefix, data[mask, stratifier_column_name])
data <- data[mask | wildtype_mask,]
# remove NA values
if (stratifier_column_name != '' & remove_na)
data <- data[wildtype_mask | (mask & ! is.na(data[,stratifier_column_name])),]
# plot
if (AML)
return (plot_survival_curve(data, 'stratify', colors, surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE))
else
return (plot_survival_curve(data, 'stratify', colors))
}
# set_notebook_plot_size(7, 7)
# plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
# mask = (dd_clinical$SF3B1 == 1),
# wildtype_mask = (dd_clinical$SF3B1 == 0),
# stratifier_column_name = '',
# legend_prefix = 'has_SF3B1_mutation',
# colors = c('orange', 'grey'),
# remove_na = FALSE)
plot_survival_curve_by_gene <- function(dd_clinical, gene_name, components, AML = FALSE) {
# Plot the survival curve of the given data on the given stratifier with the given gene and components stratifier
# → Arguments
# - dd_clinical
# - gene_name
# - components: vector of components on which to stratify
# - AML : whether to plot cumulative incidence plots for AML progression or not
# has gene vs wildtype
p1 <- plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = (dd_clinical[,gene_name] == 1),
wildtype_mask = (dd_clinical[,gene_name] == 0),
stratifier_column_name = '',
legend_prefix = sprintf('has_%s_mutation', gene_name),
colors = c('orange', 'grey'),
remove_na = FALSE,
AML = AML)
# stratify by given components and a "other" component
dd_clinical$predicted_component <- as.character(dd_clinical$predicted_component)
dd_clinical$predicted_component[! dd_clinical$predicted_component %in% components] <- 'other'
# has gene and belongs to component vs wildtype
p2 <- plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = (dd_clinical[,gene_name] == 1),
wildtype_mask = (dd_clinical[,gene_name] == 0),
stratifier_column_name = 'predicted_component',
legend_prefix = sprintf('_%s_comp_', gene_name),
colors = get_colors(length(components) + 1, 'grey'),
remove_na = FALSE,
AML = AML)
set_notebook_plot_size(20, 5)
grid.arrange(p1, p2, ncol = 2)
}
# plot_survival_curve_by_gene(dd_clinical, 'SF3B1', c(4, 6))
# plot survival curves stratified for selected gene and components
genes <- c('SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'ZRSR2', 'del5q', 'TP53')
components <- list(c(4, 6), c(1, 3, 5), c(5, 7), c(8), c(1, 7), c(2, 6), c(2, 6))
for (i in 1:length(genes))
plot_survival_curve_by_gene(dd_clinical, genes[[i]], components[[i]])
get_table(dd_clinical$IPSS_R)
| values | count | freq |
|---|---|---|
| good | 924 | 36.7% |
| int | 530 | 21% |
| very_good | 394 | 15.6% |
| poor | 386 | 15.3% |
| very_poor | 286 | 11.3% |
| -- total -- | 3300 | 100% |
# merge some IPSSR categories
dd_clinical$IPSS_R_simplified <- dd_clinical$IPSS_R
dd_clinical$IPSS_R_simplified[dd_clinical$IPSS_R %in% c('very_good', 'good')] <- 'good'
dd_clinical$IPSS_R_simplified[dd_clinical$IPSS_R %in% c('very_poor', 'poor')] <- 'poor'
plot_categorical_feature_by_categorical_feature(dd_clinical, 'IPSS_R_simplified', 'predicted_component')
# plot survival curves stratified for selected IPSS categories and components
plots <- lapply(levels(dd_clinical$predicted_component), function(i) plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = (dd_clinical$predicted_component == i),
wildtype_mask = (dd_clinical$predicted_component != i),
stratifier_column_name = 'IPSS_R_simplified',
legend_prefix = sprintf('comp_%s_IPSSR', i),
colors = c('#ABDDA4', '#FDAE61', '#D7191C', 'grey'),
remove_na = TRUE))
set_notebook_plot_size(25, 25)
grid.arrange(grobs = plots, ncol = 3)
set_notebook_plot_size(5, 4)
plot_survival_curve(dd_clinical, 'IPSS_R_simplified', c('#ABDDA4', '#FDAE61', '#D7191C'))
set_notebook_plot_size(10, 4)
plot_survival_curve(dd_clinical, 'WHO_2008_SIMPLIFY', get_colors(9))
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'good' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_IPSSR_good_comp',
colors = get_colors(4, 'grey'),
wildtype_name = 'IPSSR_int')
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor')
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_IPSSR_int_comp',
colors = get_colors(4, 'grey'),
wildtype_name = 'IPSSR_poor')
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor' & dd_clinical$predicted_component %in% c(2, 3, 4))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_IPSSR_poor_comp',
colors = get_colors(3, 'grey'),
wildtype_name = 'IPSSR_int')
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'good' & dd_clinical$predicted_component == 3)
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor' & dd_clinical$predicted_component == 4)
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_IPSSR_good_comp',
colors = get_colors(2),
wildtype_name = '_IPSSR_poor_comp_4')
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$WHO_2008_SIMPLIFY) & dd_clinical$WHO_2008_SIMPLIFY == '5q-' & dd_clinical$predicted_component == 6)
wildtype_mask <- (! is.na(dd_clinical$WHO_2008_SIMPLIFY) & dd_clinical$WHO_2008_SIMPLIFY == '5q-' & dd_clinical$predicted_component != 6)
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_WHO_5q-_comp_',
colors = get_colors(2),
wildtype_name = '_WHO_5q-_other_comp')
# get maf data
dd_maf <- read.table('data/df_maf_driver_selected.tsv', sep = '\t', stringsAsFactors = FALSE, header = TRUE)
# merge with dd_clustered
dd_maf_clustered <- merge(dd_maf, dd_clustered[,c('patient_key', 'predicted_component')], by.x = 'TARGET_NAME', by.y = 'patient_key')
print_size(dd_maf_clustered)
head(dd_maf_clustered, 2)
Size of dd_maf_clustered: 11808 x 161
| TARGET_NAME | ID_VARIANT | REFERENCE_NAME | CHR | START | END | REF | ALT | CONTEXT_5 | CONTEXT_3 | QUAL | CALLED_BY | PASSED_BY | NUMBER_OF_CALLERS | FLAGS_ALL | TARGET_VAF_MEAN | TARGET_VAF_STD | REFERENCE_VAF_MEAN | REFERENCE_VAF_STD | mutect_TARGET_VAF | mutect_TARGET_DEPTH | mutect_REFERENCE_VAF | mutect_REFERENCE_DEPTH | mutect_DIRPROP | mutect_READS_FORWARD | mutect_READS_REVERSE | strelka_TARGET_VAF | strelka_TARGET_DEPTH | strelka_REFERENCE_VAF | strelka_REFERENCE_DEPTH | caveman.pindel_TARGET_VAF | caveman.pindel_TARGET_DEPTH | caveman.pindel_REFERENCE_VAF | caveman.pindel_REFERENCE_DEPTH | caveman.pindel_DIRPROP | caveman.pindel_READS_FORWARD | caveman.pindel_READS_REVERSE | VAG_VD | VAG_VT | VAG_GENE | VAG_TRANSCRIPT | VAG_RNA_CHANGE | VAG_cDNA_CHANGE | VAG_PROTEIN_CHANGE | VAG_EFFECT | PROTEIN_CHANGE | EFFECT | VAG_AMBIGUOUS_ANNOT | VEP_Consequence | VEP_IMPACT | VEP_SYMBOL | VEP_Feature_type | VEP_Feature | VEP_BIOTYPE | VEP_EXON | VEP_INTRON | VEP_HGVSc | VEP_HGVSp | VEP_Amino_acids | VEP_Codons | VEP_Existing_variation | VEP_DISTANCE | VEP_STRAND | VEP_FLAGS | VEP_VARIANT_CLASS | VEP_SYMBOL_SOURCE | VEP_HGNC_ID | VEP_CANONICAL | VEP_CCDS | VEP_SWISSPROT | VEP_TREMBL | VEP_UNIPARC | VEP_SOURCE | VEP_GENE_PHENO | VEP_SIFT | VEP_PolyPhen | VEP_DOMAINS | VEP_HGVS_OFFSET | VEP_AF | VEP_gnomAD_AF | VEP_MAX_AF | VEP_MAX_AF_POPS | VEP_CLIN_SIG | VEP_SOMATIC | VEP_PHENO | VEP_MOTIF_NAME | VEP_MOTIF_POS | VEP_HIGH_INF_POS | VEP_MOTIF_SCORE_CHANGE | VEP_gnomAD_genome | VEP_gnomAD_genome_AC.AN_AFR | VEP_gnomAD_genome_AF_AFR | VEP_gnomAD_genome_AC.AN_AMR | VEP_gnomAD_genome_AF_AMR | VEP_gnomAD_genome_AC.AN_ASJ | VEP_gnomAD_genome_AF_ASJ | VEP_gnomAD_genome_AC.AN_EAS | VEP_gnomAD_genome_AF_EAS | VEP_gnomAD_genome_AC.AN_FIN | VEP_gnomAD_genome_AF_FIN | VEP_gnomAD_genome_AC.AN_NFE | VEP_gnomAD_genome_AF_NFE | VEP_gnomAD_genome_AC.AN_OTH | VEP_gnomAD_genome_AF_OTH | VEP_gnomAD_genome_AC.AN_Male | VEP_gnomAD_genome_AF_Male | VEP_gnomAD_genome_AC.AN_Female | VEP_gnomAD_genome_AF_Female | VEP_gnomAD_exome | VEP_gnomAD_exome_AC.AN_AFR | VEP_gnomAD_exome_AF_AFR | VEP_gnomAD_exome_AC.AN_AMR | VEP_gnomAD_exome_AF_AMR | VEP_gnomAD_exome_AC.AN_ASJ | VEP_gnomAD_exome_AF_ASJ | VEP_gnomAD_exome_AC.AN_EAS | VEP_gnomAD_exome_AF_EAS | VEP_gnomAD_exome_AC.AN_FIN | VEP_gnomAD_exome_AF_FIN | VEP_gnomAD_exome_AC.AN_NFE | VEP_gnomAD_exome_AF_NFE | VEP_gnomAD_exome_AC.AN_OTH | VEP_gnomAD_exome_AF_OTH | VEP_gnomAD_exome_AC.AN_Male | VEP_gnomAD_exome_AF_Male | VEP_gnomAD_exome_AC.AN_Female | VEP_gnomAD_exome_AF_Female | COSMIC | CENTER | variant.key | NUM_FLAGS_MUTECT | NUM_FLAGS_STRELKA | NUM_FLAGS_CAVEMAN.PINDEL | NUM_FLAGS | sum_normals | sum_normals_caller | median_vaf_normals | FINAL_SOMATIC_ANNOTATION | Elsa_comment | Elsa_todoublecheck | Elsa_todo_check_indel | IS_TRUNCATED | IS_COSMIC_EXACT | IS_COSMIC_POS | COUNT | protein.key | aa_number | CENSUS | CENSUSKB | PAN_HOTSPOT | HEMEPACT_COUNT | ELLI_HOTSPOT | DRIVER | FLAG_DRIVER_CATEGORY | Elsa_oncogenic | VAF | REFERENCE_VAF | REMOVE_CURATION_MERGE | ANNOT_PHASE | ANNOT_PHASE_2 | predicted_component |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| E-H-100000-T1-1-D1-1 | c06198e2-9200-11e7-b5e7-eb21424a729f | I-H-100778-N1-1-D1-1 | 11 | 119149248 | 119149248 | G | A | TTTCT | CCGAT | NA | C,S,M | M,S | 2 | MNP | 0.083 | 0.003 | 0.001 | 0 | 0.087 | 917 | 0.001 | 1319 | 1.0 | 40 | 40 | 0.079 | 1308 | 0.001 | 1613 | 0.082 | 1308 | 0.001 | 1612 | 0.672 | 64 | 43 | CBL|CCDS8418.1|r.1632g>a|c.1256G>A|p.C419Y|protein_coding:CDS:exon:substitution:codon_variant:non_synonymous_codon|SO:0000010:SO:0000316:SO:0000147:SO:1000002:SO:0001581:SO:0001583 | Sub | CBL | CCDS8418.1 | r.1632g>a | c.1256G>A | p.C419Y | non_synonymous_codon | p.C419Y | non_synonymous_codon | 1 | missense_variant | MODERATE | CBL | Transcript | ENST00000264033 | protein_coding | 9|16 | ENST00000264033.4:c.1256G>A | ENSP00000264033.3:p.Cys419Tyr | C/Y | tGc/tAc | COSM5945221 | NA | 1 | SNV | HGNC | 1541 | YES | CCDS8418.1 | P22681 | UPI000013D4A7 | NA | 1 | deleterious(0) | probably_damaging(1) | Gene3D:3.30.40.10&Pfam_domain:PF13920&PROSITE_profiles:PS50089&hmmpanther:PTHR23007&hmmpanther:PTHR23007:SF5&SMART_domains:SM00184&Superfamily_domains:SSF57850 | NA | NA | NA | NA | 1 | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 11:119149248-119149248 | 0 | 15304 | 0 | 0 | 33582 | 0 | 0 | 9850 | 0 | 0 | 17248 | 0 | 0 | 22300 | 0 | 1 | 111614 | 8.96e-06 | 0 | 5482 | 0 | 1 | 134846 | 7.42e-06 | 0 | 111316 | 0 | GENOMIC_EXACT;CBL_p.C419Y;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.C416Y;ALL=3;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=3|GENOMIC_CLOSE;CBL_p.C416F;ALL=1;SOM=1;GENITAL_TRACT=1|GENOMIC_CLOSE;CBL_p.C416W;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417A;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417S;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417H;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;SKIN=1|GENOMIC_CLOSE;CBL_p.P417R;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417L;ALL=7;SOM=7;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=4;SKIN=3|GENOMIC_CLOSE;CBL_p.F418I;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;SKIN=1|GENOMIC_CLOSE;CBL_p.F418S;ALL=6;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=6|GENOMIC_CLOSE;CBL_p.F418L;ALL=3;SOM=3;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2;SKIN=1|GENOMIC_CLOSE;CBL_p.C419R;ALL=2;SOM=2;GENITAL_TRACT=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;CBL_p.R420G;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;SKIN=1|GENOMIC_CLOSE;CBL_p.R420*;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;ENDOMETRIUM=1|GENOMIC_CLOSE;CBL_p.R420Q;ALL=17;SOM=17;CENTRAL_NERVOUS_SYSTEM=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=16|GENOMIC_CLOSE;CBL_p.R420P;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.R420L;ALL=1;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;CBL_p.I423_E427delIKGTE;ALL=1;SOM=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;CBL_p.C421W;ALL=1;SOM=1;LARGE_INTESTINE=1 | IHBT | 11_119149248_G_A | 0 | 0 | 0 | 0 | 0 | 0 | 0 | OK | NA | NA | not-truncated | 1 | 0 | 1 | CBL_C419 | 419 | oncogene/TSG | TSG | 0 | 0 | 1 | driver | recurrence_cosmic | 0.083 | 0.001 | 6 | ||||||||||||||||||||
| E-H-100000-T1-1-D1-1 | 5_131822301_G_T | I-H-100778-N1-1-D1-1 | 5 | 131822301 | 131822301 | G | T | ACTGT | TAGCT | NA | S,M | M,S | 2 | PASS | 0.022 | 0.002 | 0.002 | 0 | 0.024 | 374 | 0.002 | 1145 | 0.8 | 5 | 4 | 0.021 | 532 | 0.001 | 1376 | NA | NA | NA | NA | NA | NA | NA | IRF1|CCDS4155.1|r.751c>a|c.492C>A|p.Y164*|protein_coding:CDS:exon:substitution:codon_variant:stop_gained|SO:0000010:SO:0000316:SO:0000147:SO:1000002:SO:0001581:SO:0001587 | Sub | IRF1 | CCDS4155.1 | r.751c>a | c.492C>A | p.Y164* | stop_gained | p.Y164* | stop_gained | 1 | stop_gained | HIGH | IRF1 | Transcript | ENST00000245414 | protein_coding | 6|10 | ENST00000245414.4:c.492C>A | ENSP00000245414.4:p.Tyr164Ter | Y/* | taC/taA | NA | -1 | SNV | HGNC | 6116 | YES | CCDS4155.1 | P10914 | R4GNI0&Q75MZ8&Q6FHN8&C9JD95 | UPI000012D885 | NA | 1 | PIRSF_domain:PIRSF038196&hmmpanther:PTHR11949&hmmpanther:PTHR11949:SF3 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | GENOMIC_CLOSE;IRF1_p.P167fs*42;ALL=1;SOM=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;IRF1_p.Y164C;ALL=1;SOM=1;BONE=1 | IHBT | 5_131822301_G_T | 0 | 0 | 0 | 0 | 0 | 0 | 0 | OK | NA | NA | truncated | 0 | 0 | 1 | IRF1_Y164 | 164 | not_known | not_known | 0 | 0 | 0 | driver | NA | possible | 0.022 | 0.002 | 6 |
plot_continuous_feature_by_categorical_feature(dd_maf_clustered, 'TARGET_VAF_MEAN', 'predicted_component', FALSE)
plot_continuous_feature_stratified_by_component_and_gene <- function(data, gene, components) {
data <- data[data$VAG_GENE == gene,]
data$category <- sprintf('%s_other', gene)
mask <- data$VAG_GENE == gene & data$predicted_component %in% components
data$category[mask] <- sprintf('%s_comp_%s', gene, data$predicted_component[mask])
data$category <- factor(data$category)
plot_continuous_feature_by_categorical_feature(data, 'VAF', 'category', FALSE)
}
# plot_continuous_feature_stratified_by_component_and_gene(dd_maf_clustered, 'SF3B1', c(4, 6))
# plot VAF distribution stratified by selected genes and components
genes <- c('SF3B1', 'SRSF2', 'U2AF1', 'ZRSR2', 'TP53')
components <- list(c(4, 6), c(1, 3, 5), c(5, 7, 8), c(1, 7), c(2, 6))
for (i in 1:length(genes))
plot_continuous_feature_stratified_by_component_and_gene(dd_maf_clustered, genes[[i]], components[[i]])
plot_categorical_feature_by_categorical_feature(dd_clinical, 'AML', 'predicted_component')
set_notebook_plot_size(12, 4)
plot_survival_curve(dd_clinical, 'predicted_component', get_colors(11, 'grey'), line_size = 0.8, legend_size = 10, legend_interspace = 0.8, surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE)
# plot AML cumulative incidence curves stratified for selected gene and components
genes <- c('SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'ZRSR2', 'del5q', 'TP53')
components <- list(c(4, 6), c(1, 3, 5), c(5, 7), c(8), c(1, 7), c(2, 6), c(2, 6))
for (i in 1:length(genes))
plot_survival_curve_by_gene(dd_clinical, genes[[i]], components[[i]], AML = TRUE)
set_notebook_plot_size(5, 4)
plot_survival_curve(dd_clinical, 'IPSS_R_simplified', c('#ABDDA4', '#FDAE61', '#D7191C'), surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE)
set_notebook_plot_size(10, 4)
plot_survival_curve(dd_clinical, 'WHO_2008_SIMPLIFY', get_colors(9), surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE)
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'good' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_IPSSR_good_comp',
colors = get_colors(4, 'grey'),
wildtype_name = 'IPSSR_int',
AML = TRUE)
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor')
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_IPSSR_int_comp',
colors = get_colors(4, 'grey'),
wildtype_name = 'IPSSR_poor',
AML = TRUE)
set_notebook_plot_size(7, 4)
mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor' & dd_clinical$predicted_component %in% c(2, 3, 4))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = 'predicted_component',
legend_prefix = '_IPSSR_poor_comp',
colors = get_colors(3, 'grey'),
wildtype_name = 'IPSSR_int',
AML = TRUE)
Work in progress...
# let's transform every variable we are going to use to factors
dd_clinical$IPSS_R <- factor(dd_clinical$IPSS_R)
dd_clinical$SEX <- factor(dd_clinical$SEX)
dd_clinical$dmt_active <- factor(dd_clinical$dmt_active)
dd_clinical$lenalidomid <- factor(dd_clinical$lenalidomid)
dd_clinical$hma <- factor(dd_clinical$hma)
dd_clinical$transplant <- factor(dd_clinical$transplant)
Let's compare the concordance score with 4 sets of features:
for (features in c('AGE_AT_SAMPLE_TIME + SEX + dmt_active + lenalidomid + hma + transplant',
'AGE_AT_SAMPLE_TIME + SEX + predicted_component + dmt_active + lenalidomid + hma + transplant',
'AGE_AT_SAMPLE_TIME + SEX + IPSS_R + dmt_active + lenalidomid + hma + transplant',
'AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant')) {
formula <- sprintf('Surv(os_diag_years, os_status) ~ %s', features)
cat(sprintf('Model: %s\n', formula))
res <- coxph(as.formula(formula), data = dd_clinical)
cat(sprintf(' ▴ Concordance: %.3f\n\n', res$concordance['concordance']))
}
Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + dmt_active + lenalidomid + hma + transplant ▴ Concordance: 0.609 Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + predicted_component + dmt_active + lenalidomid + hma + transplant ▴ Concordance: 0.702 Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + dmt_active + lenalidomid + hma + transplant ▴ Concordance: 0.733 Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant ▴ Concordance: 0.746
Some detail results on the last set of features significance:
res <- coxph(Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant, data = dd_clinical)
summary(res)
set_notebook_plot_size(8, 6)
ggforest(res, fontsize = 0.5)
Call:
coxph(formula = Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME +
SEX + IPSS_R + predicted_component + dmt_active + lenalidomid +
hma + transplant, data = dd_clinical)
n= 2495, number of events= 1134
(805 observations deleted due to missingness)
coef exp(coef) se(coef) z Pr(>|z|)
AGE_AT_SAMPLE_TIME 0.028949 1.029372 0.003291 8.796 < 2e-16 ***
SEXM 0.231227 1.260146 0.064021 3.612 0.000304 ***
IPSS_Rint 0.453103 1.573186 0.085832 5.279 1.30e-07 ***
IPSS_Rpoor 0.947454 2.579136 0.096018 9.867 < 2e-16 ***
IPSS_Rvery_good -0.470838 0.624479 0.116506 -4.041 5.32e-05 ***
IPSS_Rvery_poor 1.430524 4.180890 0.103763 13.786 < 2e-16 ***
predicted_component2 0.738750 2.093318 0.112301 6.578 4.76e-11 ***
predicted_component3 0.536538 1.710076 0.105752 5.074 3.90e-07 ***
predicted_component4 -0.164761 0.848096 0.111101 -1.483 0.138079
predicted_component5 0.778740 2.178726 0.124441 6.258 3.90e-10 ***
predicted_component6 -0.032367 0.968151 0.135705 -0.239 0.811487
predicted_component7 0.521346 1.684293 0.134042 3.889 0.000100 ***
predicted_component8 0.270040 1.310017 0.150685 1.792 0.073119 .
predicted_component9 0.725433 2.065626 0.216309 3.354 0.000797 ***
predicted_component10 0.586215 1.797173 0.294396 1.991 0.046454 *
predicted_component11 0.147411 1.158830 0.458527 0.321 0.747840
predicted_componentNaN -0.208332 0.811937 0.214772 -0.970 0.332037
dmt_activeyes 0.326444 1.386031 0.125711 2.597 0.009410 **
lenalidomid1 -0.170154 0.843535 0.205710 -0.827 0.408150
hma1 -0.224744 0.798721 0.121858 -1.844 0.065139 .
transplant1 -0.771528 0.462306 0.132226 -5.835 5.38e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
exp(coef) exp(-coef) lower .95 upper .95
AGE_AT_SAMPLE_TIME 1.0294 0.9715 1.0228 1.0360
SEXM 1.2601 0.7936 1.1115 1.4286
IPSS_Rint 1.5732 0.6357 1.3296 1.8614
IPSS_Rpoor 2.5791 0.3877 2.1367 3.1132
IPSS_Rvery_good 0.6245 1.6013 0.4970 0.7847
IPSS_Rvery_poor 4.1809 0.2392 3.4115 5.1238
predicted_component2 2.0933 0.4777 1.6797 2.6087
predicted_component3 1.7101 0.5848 1.3900 2.1039
predicted_component4 0.8481 1.1791 0.6821 1.0544
predicted_component5 2.1787 0.4590 1.7072 2.7805
predicted_component6 0.9682 1.0329 0.7420 1.2632
predicted_component7 1.6843 0.5937 1.2952 2.1904
predicted_component8 1.3100 0.7633 0.9750 1.7601
predicted_component9 2.0656 0.4841 1.3519 3.1563
predicted_component10 1.7972 0.5564 1.0093 3.2002
predicted_component11 1.1588 0.8629 0.4718 2.8465
predicted_componentNaN 0.8119 1.2316 0.5330 1.2369
dmt_activeyes 1.3860 0.7215 1.0833 1.7733
lenalidomid1 0.8435 1.1855 0.5636 1.2624
hma1 0.7987 1.2520 0.6290 1.0142
transplant1 0.4623 2.1631 0.3568 0.5991
Concordance= 0.746 (se = 0.008 )
Rsquare= 0.266 (max possible= 0.998 )
Likelihood ratio test= 772.2 on 21 df, p=<2e-16
Wald test = 789.8 on 21 df, p=<2e-16
Score (logrank) test = 907.8 on 21 df, p=<2e-16
Warning message in .get_data(model, data = data): “The `data` argument is not provided. Data will be extracted from model fit.”Warning message: “Removed 7 rows containing missing values (geom_errorbar).”
mean_imputation <- function(data, column_name) {
# Proceed mean imputation on selected column
# → Arguments
# - data
# - column_name: column on which to impute values
cat(sprintf('Mean imputation for %s (%d NA values)\n', column_name, nrow(data[is.na(data[, column_name]),])))
data[is.na(data[, column_name]), column_name] <- mean(data[, column_name], na.rm = TRUE)
return (data)
}
# prepare data
data <- dd_clinical
data <- mean_imputation(data, 'AGE_AT_SAMPLE_TIME')
for (f in c('AGE_AT_SAMPLE_TIME', 'SEX', 'IPSS_R', 'predicted_component', 'dmt_active', 'lenalidomid', 'hma', 'transplant')) {
cat(sprintf('Feature %-19s: removed %3d NA values\n', f, nrow(data[is.na(data[, f]),])))
data <- data[! is.na(data[, f]),]
}
print_size(data)
y <- data[, c('os_diag_years', 'os_status')]
colnames(y) <- c('time', 'status')
x <- model.matrix(~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant, data)
Mean imputation for AGE_AT_SAMPLE_TIME (20 NA values) Feature AGE_AT_SAMPLE_TIME : removed 0 NA values Feature SEX : removed 0 NA values Feature IPSS_R : removed 780 NA values Feature predicted_component: removed 0 NA values Feature dmt_active : removed 16 NA values Feature lenalidomid : removed 0 NA values Feature hma : removed 0 NA values Feature transplant : removed 0 NA values Size of data: 2504 x 256
# remove NA status values and negative time values
rm_mask <- is.na(y[,'status']) | y[,'time'] <= 0
sum(rm_mask)
x <- x[! rm_mask,]
y <- y[! rm_mask,]
y <- as.matrix(y)
print_size(x)
print_size(y)
Size of x: 2477 x 22 Size of y: 2477 x 2
glmnet
fit: fitted modelplot(fit): each curve corresponds to a variable, it shows the path of its coefficient against the $\ell_1$-norm of the whole coefficient vector at as $\lambda$ varies. The axis above indicates the number of nonzero coefficients at the current $\lambda$, which is the effective degrees of freedom (df) for the lasso.# fit one model
fit = glmnet(x, y, family = "cox")
# plot lasso path
set_notebook_plot_size(10, 10)
par(mar = c(4, 4, 1, 15))
plot(fit, label = TRUE)
vnat = coef(fit)
vnat = vnat[-1, ncol(vnat)] # remove intercept and get coefficients at the end of the path
axis(4, at = vnat, line = -.4, label = colnames(x)[-1], las = 1, tick = FALSE, cex.axis = 0.7)
print(fit): summary of path at each step with the following columns:Df: number of nonzero coefficients%Dev: percent of deviance explainedprint(fit)
Call: glmnet(x = x, y = y, family = "cox")
Df %Dev Lambda
[1,] 0 0.000000 0.187900
[2,] 1 0.003768 0.171200
[3,] 1 0.006466 0.156000
[4,] 1 0.008464 0.142100
[5,] 2 0.010280 0.129500
[6,] 4 0.014560 0.118000
[7,] 6 0.019020 0.107500
[8,] 6 0.022850 0.097950
[9,] 6 0.026000 0.089250
[10,] 6 0.028610 0.081320
[11,] 6 0.030770 0.074090
[12,] 7 0.032730 0.067510
[13,] 9 0.034760 0.061510
[14,] 11 0.036860 0.056050
[15,] 12 0.038710 0.051070
[16,] 12 0.040530 0.046530
[17,] 13 0.042040 0.042400
[18,] 13 0.043320 0.038630
[19,] 14 0.044390 0.035200
[20,] 16 0.045450 0.032070
[21,] 15 0.046360 0.029220
[22,] 15 0.047100 0.026630
[23,] 15 0.047710 0.024260
[24,] 17 0.048290 0.022110
[25,] 17 0.048830 0.020140
[26,] 17 0.049280 0.018350
[27,] 17 0.049650 0.016720
[28,] 18 0.049960 0.015240
[29,] 18 0.050240 0.013880
[30,] 18 0.050480 0.012650
[31,] 18 0.050680 0.011530
[32,] 18 0.050840 0.010500
[33,] 18 0.050980 0.009569
[34,] 18 0.051100 0.008719
[35,] 18 0.051190 0.007945
[36,] 18 0.051270 0.007239
[37,] 18 0.051340 0.006596
[38,] 18 0.051390 0.006010
[39,] 19 0.051440 0.005476
[40,] 19 0.051510 0.004990
[41,] 20 0.051570 0.004546
[42,] 20 0.051620 0.004142
[43,] 20 0.051660 0.003774
[44,] 20 0.051700 0.003439
[45,] 20 0.051730 0.003134
[46,] 21 0.051760 0.002855
[47,] 21 0.051780 0.002602
[48,] 21 0.051800 0.002370
[49,] 21 0.051820 0.002160
[50,] 21 0.051830 0.001968
[51,] 21 0.051840 0.001793
[52,] 21 0.051850 0.001634
cvfit: compute $k$-fold cross-validationplot(cvfit): view the optimal $\lambda$ value and a cross validated error plot# 10-folds cross validation to find lambda
cvfit = cv.glmnet(x, y, family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)
# plot results
set_notebook_plot_size(10, 6)
plot(cvfit)
cvfit$lambda.min
cvfit$lambda.1se
coef(cvfit, s = 'lambda.1se')
22 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) .
AGE_AT_SAMPLE_TIME 0.02089268
SEXM 0.11275873
IPSS_Rint 0.20398802
IPSS_Rpoor 0.76639071
IPSS_Rvery_good -0.39694028
IPSS_Rvery_poor 1.25601535
predicted_component2 0.40408349
predicted_component3 0.21462811
predicted_component4 -0.23347151
predicted_component5 0.37399522
predicted_component6 -0.05011404
predicted_component7 0.06984766
predicted_component8 .
predicted_component9 .
predicted_component10 .
predicted_component11 .
predicted_componentNaN -0.04734575
dmt_activeyes 0.03872877
lenalidomid1 .
hma1 .
transplant1 -0.36691068
# format coef to an easy-to-use dataframe
coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
colnames(coefs) <- 'coefficient_value'
coefs$feature_name <- rownames(coefs)
coefs$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other')
rownames(coefs) <- NULL
coefs <- coefs[order(coefs$coefficient_value),]
coefs$feature_name <- factor(coefs$feature_name, levels = coefs$feature_name)
coefs
| coefficient_value | feature_name | feature_group | |
|---|---|---|---|
| 5 | -0.39694028 | IPSS_Rvery_good | IPSSR |
| 21 | -0.36691068 | transplant1 | other |
| 9 | -0.23347151 | predicted_component4 | component |
| 11 | -0.05011404 | predicted_component6 | component |
| 17 | -0.04734575 | predicted_componentNaN | component |
| 13 | 0.00000000 | predicted_component8 | component |
| 14 | 0.00000000 | predicted_component9 | component |
| 15 | 0.00000000 | predicted_component10 | component |
| 16 | 0.00000000 | predicted_component11 | component |
| 19 | 0.00000000 | lenalidomid1 | other |
| 20 | 0.00000000 | hma1 | other |
| 1 | 0.02089268 | AGE_AT_SAMPLE_TIME | other |
| 18 | 0.03872877 | dmt_activeyes | other |
| 12 | 0.06984766 | predicted_component7 | component |
| 2 | 0.11275873 | SEXM | other |
| 3 | 0.20398802 | IPSS_Rint | IPSSR |
| 8 | 0.21462811 | predicted_component3 | component |
| 10 | 0.37399522 | predicted_component5 | component |
| 7 | 0.40408349 | predicted_component2 | component |
| 4 | 0.76639071 | IPSS_Rpoor | IPSSR |
| 6 | 1.25601535 | IPSS_Rvery_poor | IPSSR |
ggplot(coefs) + geom_bar(aes(feature_name, coefficient_value, fill = feature_group), stat = 'identity') + coord_flip()
Let's run the experiment above 100 times with different random sampling of the data, and look at the probability of being null for each feature:
features_list <- data.frame(feature_name = colnames(x)[-1], count_not_null = 0)
features_list$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other')
n_experiment <- 100
for (i in 1:n_experiment) {
print_and_flush(sprintf('\rExperiment %d/%d...', i, n_experiment))
# sample 80% of x and y with replacement
set.seed(i * 100)
sampling <- sample(1:nrow(x), 0.8 * nrow(x), replace = TRUE)
cvfit <- cv.glmnet(x[sampling,], y[sampling,], family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)
coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
colnames(coefs) <- 'coefficient_value'
coefs$feature_name <- rownames(coefs)
rownames(coefs) <- NULL
coefs$not_null <- coefs$coefficient_value != 0
features_list$count_not_null <- features_list$count_not_null + coefs$not_null
}
cat(' done!')
features_list$count_not_null <- features_list$count_not_null / n_experiment
features_list
Experiment 100/100... done!
| feature_name | count_not_null | feature_group |
|---|---|---|
| AGE_AT_SAMPLE_TIME | 1.00 | other |
| SEXM | 0.88 | other |
| IPSS_Rint | 0.93 | IPSSR |
| IPSS_Rpoor | 1.00 | IPSSR |
| IPSS_Rvery_good | 1.00 | IPSSR |
| IPSS_Rvery_poor | 1.00 | IPSSR |
| predicted_component2 | 1.00 | component |
| predicted_component3 | 0.94 | component |
| predicted_component4 | 1.00 | component |
| predicted_component5 | 1.00 | component |
| predicted_component6 | 0.72 | component |
| predicted_component7 | 0.55 | component |
| predicted_component8 | 0.12 | component |
| predicted_component9 | 0.25 | component |
| predicted_component10 | 0.25 | component |
| predicted_component11 | 0.01 | component |
| predicted_componentNaN | 0.61 | component |
| dmt_activeyes | 0.46 | other |
| lenalidomid1 | 0.01 | other |
| hma1 | 0.46 | other |
| transplant1 | 1.00 | other |
# order
features_list <- features_list[order(features_list$count_not_null),]
features_list$feature_name <- factor(features_list$feature_name, levels = features_list$feature_name)
ggplot(features_list) + geom_bar(aes(feature_name, count_not_null, fill = feature_group), stat = 'identity') + coord_flip()
Let's make the same two plots but adding genes and cytogenetics features:
# prepare data
data <- dd_clinical
data <- mean_imputation(data, 'AGE_AT_SAMPLE_TIME')
for (f in c('AGE_AT_SAMPLE_TIME', 'SEX', 'IPSS_R', 'predicted_component', 'dmt_active', 'lenalidomid', 'hma', 'transplant')) {
cat(sprintf('Feature %-19s: removed %3d NA values\n', f, nrow(data[is.na(data[, f]),])))
data <- data[! is.na(data[, f]),]
}
print_size(data)
y <- data[, c('os_diag_years', 'os_status')]
colnames(y) <- c('time', 'status')
formula <- sprintf('~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant + %s', paste(colnames(dd_all), collapse = ' + '))
x <- model.matrix(as.formula(formula), data)
Mean imputation for AGE_AT_SAMPLE_TIME (20 NA values) Feature AGE_AT_SAMPLE_TIME : removed 0 NA values Feature SEX : removed 0 NA values Feature IPSS_R : removed 780 NA values Feature predicted_component: removed 0 NA values Feature dmt_active : removed 16 NA values Feature lenalidomid : removed 0 NA values Feature hma : removed 0 NA values Feature transplant : removed 0 NA values Size of data: 2504 x 256
rm_mask <- is.na(y[,'status']) | y[,'time'] <= 0
sum(rm_mask)
x <- x[! rm_mask,]
y <- y[! rm_mask,]
y <- as.matrix(y)
print_size(x)
print_size(y)
Size of x: 2477 x 179 Size of y: 2477 x 2
# fit one model
fit = glmnet(x, y, family = "cox")
# plot lasso path
set_notebook_plot_size(20, 10)
plot(fit)
# 10-folds cross validation to find lambda
cvfit = cv.glmnet(x, y, family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)
# plot results
set_notebook_plot_size(10, 6)
plot(cvfit)
cvfit$lambda.min
cvfit$lambda.1se
# format coef to an easy-to-use dataframe
coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
colnames(coefs) <- 'coefficient_value'
coefs$feature_name <- rownames(coefs)
coefs$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other',
rep('gene', 107), rep('cyto', 50))
rownames(coefs) <- NULL
coefs <- coefs[order(coefs$coefficient_value),]
coefs$feature_name <- factor(coefs$feature_name, levels = coefs$feature_name)
set_notebook_plot_size(10, 25)
ggplot(coefs) + geom_bar(aes(feature_name, coefficient_value, fill = feature_group), stat = 'identity') + coord_flip()
features_list <- data.frame(feature_name = colnames(x)[-1], count_not_null = 0)
features_list$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other',
rep('gene', 107), rep('cyto', 50))
n_experiment <- 100
for (i in 1:n_experiment) {
print_and_flush(sprintf('\rExperiment %d/%d...', i, n_experiment))
# sample 80% of x and y with replacement
set.seed(i * 100)
sampling <- sample(1:nrow(x), 0.8 * nrow(x), replace = TRUE)
cvfit <- cv.glmnet(x[sampling,], y[sampling,], family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)
coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
colnames(coefs) <- 'coefficient_value'
coefs$feature_name <- rownames(coefs)
rownames(coefs) <- NULL
coefs$not_null <- coefs$coefficient_value != 0
features_list$count_not_null <- features_list$count_not_null + coefs$not_null
}
cat(' done!')
features_list$count_not_null <- features_list$count_not_null / n_experiment
Experiment 100/100... done!
# order
features_list <- features_list[order(features_list$count_not_null),]
features_list$feature_name <- factor(features_list$feature_name, levels = features_list$feature_name)
ggplot(features_list) + geom_bar(aes(feature_name, count_not_null, fill = feature_group), stat = 'identity') + coord_flip()
set_notebook_plot_size(7, 4)
mask <- (dd_clinical$MYC == 1)
wildtype_mask <- ! mask
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = '',
legend_prefix = 'has_MYC',
colors = get_colors(1, 'grey'),
wildtype_name = 'wildtype',
AML = TRUE)
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = '',
legend_prefix = 'has_MYC',
colors = get_colors(1, 'grey'),
wildtype_name = 'wildtype',
AML = FALSE)
set_notebook_plot_size(7, 4)
mask <- (dd_clinical$plus13 == 1)
wildtype_mask <- ! mask
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = '',
legend_prefix = 'has_plus13',
colors = get_colors(1, 'grey'),
wildtype_name = 'wildtype',
AML = TRUE)
plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
mask = mask,
wildtype_mask = wildtype_mask,
stratifier_column_name = '',
legend_prefix = 'has_plus13',
colors = get_colors(1, 'grey'),
wildtype_name = 'wildtype',
AML = FALSE)